# Clear all objects from the environment before knitting.
# NOTE(review): rm(list = ls()) in scripts is generally discouraged (it
# does not detach packages or reset options); kept because the knitting
# workflow appears to rely on a clean environment.
rm(list = ls())
books <- read.csv(here::here("datasets", "books.csv"))
books_two <- read.csv(here::here("datasets", "final_dataset.csv"))
# ---------------------------------------
library('yardstick')
# ---------------------------------------
# ---------------------------------------
# data visualization
# --------------------------------------
library('ggplot2')
library('plotly')
library('gganimate')
library('ggridges')
# ---------------------------------------
# data manipulation
# --------------------------------------
library('forcats')
library('tidyverse')
library('magrittr')
library('lubridate')
library('dplyr')
library('DT')
#install.packages("formattable")
#install.packages("tidyr")
library('formattable')
library('tidyr')
library('data.table')
library('kableExtra')
# ---------------------------------------
# sentiment analysis
# --------------------------------------
library('sentimentr')
# ---------------------------------------
# summary statistics
# --------------------------------------
#install.packages("qwraps2")
library("qwraps2")
# ---------------------------------------
# model validation library
# ---------------------------------------
library('rsample')
# ---------------------------------------
# generalized linear model libraries
# ---------------------------------------
library('glmnet')
library('glmnetUtils')
# ---------------------------------------
# regression output
# ---------------------------------------
# install.packages('sjPlot')
library('sjPlot')
# install.packages('sjPlot')
library('tidymodels')
# ---------------------------------------
# random forest libraries
# ---------------------------------------
library('partykit')
#library('tidyverse')
library('PerformanceAnalytics')
library('rpart')
library('rpart.plot')
library('randomForest')
#install.packages("randomForestExplainer")
library('randomForestExplainer')
# ---------------------------------------
# lasso libraries
# ---------------------------------------
library('broom')
library('coefplot')
# ---------------------------------------
# Rename book-level columns to clearer, consistent names.
books <- books %>% rename(avg_book_rating = average_rating,
                          book_ratings_count = ratings_count,
                          author = authors)
# Rename author-level columns to match the book data's naming convention.
# Fix: removed the trailing comma after the final rename pair — it only
# parses because rlang's dynamic dots tolerate it, and it breaks under
# plain R function-call semantics.
books_two <- books_two %>% rename(author = name,
                                  authorworkcount = workcount,
                                  author_fans = fan_count,
                                  avg_author_rating = average_rate,
                                  author_ratings_count = rating_count,
                                  author_review_count = review_count)
Need ‘sentimentr’ library.
# Title-level sentiment: split each title into sentences, then pass
# books$title as the `by` grouping so sentiment_by() averages the
# sentence scores back to one row per title (see output below:
# title / word_count / sd / ave_sentiment).
sentiment_DF <- get_sentences(books$title) %>% sentiment_by(books$title)
head(sentiment_DF)
## title
## 1:
## 2: said the shotgun to the head.
## 3: $30 Film School: How to Write Direct Produce Shoot Edit Distribute Tour With and Sell Your Own No-Budget Digital Movie
## 4: 'Salem's Lot
## 5: 1 000 Places to See Before You Die
## 6: 10 lb Penalty
## word_count sd ave_sentiment
## 1: 0 0 0.0000000
## 2: 6 NA -0.1632993
## 3: 20 NA -0.1118034
## 4: 16 0 0.0000000
## 5: 6 NA -0.3061862
## 6: 2 NA -0.5303301
# mutate to correct column data types
# NOTE(review): `books_sa` is not created in this file as shown — it is
# presumably `books` merged with `books_two` and `sentiment_DF`; confirm
# upstream.
books_1 <- books_sa %>% mutate(num_pages = as.numeric(num_pages),
avg_book_rating = as.numeric(avg_book_rating),
text_reviews_count = as.numeric(text_reviews_count),
# Dates arrive as "m/d/Y" strings in the CSVs.
publication_date = as.Date(publication_date, format="%m/%d/%Y"),
born = as.Date(born, format="%m/%d/%Y"),
died = as.Date(died, format="%m/%d/%Y"),
gender = as.factor(gender)
)
# Drop rows that would distort the rating models: missing ratings /
# counts / dates, duplicate titles, zero ratings, the "NOT A BOOK"
# placeholder author, one known-bad record, and page-count outliers.
books_total <- books_1 %>%
  filter(
    !is.na(avg_book_rating), !is.na(book_ratings_count),
    !is.na(text_reviews_count), !is.na(publication_date),
    !duplicated(title),
    avg_book_rating != 0,
    author != "NOT A BOOK",
    # Plain comparisons instead of magrittr's is_greater_than /
    # is_less_than aliases: !(x > 2000) is x <= 2000 and !(x < 10) is
    # x >= 10; NA rows are dropped by filter() in both forms.
    num_pages <= 2000,
    num_pages != 0,
    bookID != 9796,
    num_pages >= 10
  )
# Drop 12 identifier/metadata columns with no predictive value (ISBN-13,
# title-sentiment SD, author id, image URL, biography fields, social
# links, and geographic fields), then give the two sentiment-derived
# columns clearer names.
books_corti <- books_total %>%
  select(-c(isbn13, sd, authorid, image_url, about, influence,
            website, twitter, original_hometown, country,
            latitude, longitude)) %>%
  rename(title_sentiment_avg = ave_sentiment,
         title_word_count = word_count)
# View(books_corti)
# NA VISUALIZATION: report the number of missing values in each column
# of books_corti, one "Variable: <name> NAs: <count>" line per column.
for (i in seq_len(ncol(books_corti))) {
  # seq_len() is safe for zero-column inputs (1:ncol would yield c(1, 0)),
  # and [[i]] extracts the column directly instead of the heavier
  # books_corti %>% select(i) pipeline.
  print(
    paste0("Variable: ",
           names(books_corti)[i], " NAs: ",
           sum(is.na(books_corti[[i]])))
  )
}
## [1] "Variable: bookID NAs: 0"
## [1] "Variable: title NAs: 0"
## [1] "Variable: author NAs: 0"
## [1] "Variable: avg_book_rating NAs: 0"
## [1] "Variable: isbn NAs: 0"
## [1] "Variable: language_code NAs: 0"
## [1] "Variable: num_pages NAs: 0"
## [1] "Variable: book_ratings_count NAs: 0"
## [1] "Variable: text_reviews_count NAs: 0"
## [1] "Variable: publication_date NAs: 0"
## [1] "Variable: publisher NAs: 0"
## [1] "Variable: title_word_count NAs: 0"
## [1] "Variable: title_sentiment_avg NAs: 0"
## [1] "Variable: authorworkcount NAs: 0"
## [1] "Variable: author_fans NAs: 0"
## [1] "Variable: gender NAs: 0"
## [1] "Variable: born NAs: 5914"
## [1] "Variable: died NAs: 5914"
## [1] "Variable: avg_author_rating NAs: 0"
## [1] "Variable: author_ratings_count NAs: 0"
## [1] "Variable: author_review_count NAs: 0"
## [1] "Variable: genre NAs: 0"
# starts_with() function for certain columns...
books_corti %>% select(starts_with("isbn")) %>% glimpse()
## Rows: 5,914
## Columns: 1
## $ isbn <chr> "0439554896", "0517226952", "0345453743", "1400052920", "0517149…
# Top 10 highest-rated books: keep title / author / rating, sort the
# ratings in descending order, then take the first ten rows.
# (arrange() is a stable sort, so selecting columns first yields the
# same rows in the same order as arranging the full frame.)
explore_data <- books_corti %>%
  select(title, author, avg_book_rating) %>%
  arrange(desc(avg_book_rating)) %>%
  slice(1:10)
print(explore_data)
## title
## 1 Zone of the Enders: The 2nd Runner Official Strategy Guide
## 2 The Diamond Color Meditation: Color Pathway to the Soul
## 3 Taxation of Mineral Rents
## 4 The Irish Anatomist: A Study of Flann O'Brien
## 5 His Princess Devotional: A Royal Encounter With Your King
## 6 Stargirl LitPlans on CD
## 7 The Complete Calvin and Hobbes
## 8 Wissenschaft der Logik: Die Lehre Vom Begriff (1816)
## 9 It's a Magical World (Calvin and Hobbes #11)
## 10 Homicidal Psycho Jungle Cat (Calvin and Hobbes #9)
## author avg_book_rating
## 1 Tim Bogenn 5.00
## 2 John Diamond 5.00
## 3 Ross Garnaut 5.00
## 4 Keith Donohue 5.00
## 5 Sheri Rose Shepherd 5.00
## 6 Mary B. Collins 4.86
## 7 Bill Watterson 4.82
## 8 Georg Wilhelm Friedrich Hegel 4.78
## 9 Bill Watterson 4.76
## 10 Bill Watterson 4.72
datatable(books_corti)
Linear regression analysis is sensitive to outliers. Use histogram to see where this will occur.
# Histogram of the outcome — linear regression is sensitive to
# outliers, so inspect the shape of avg_book_rating first.
ggplot(books_corti, aes(x = avg_book_rating)) +
  geom_histogram(fill = "skyblue", color = "#879bcd") +
  labs(x = "Average Book Rating", y = "Count",
       title = " Histogram to View Outliers") +
  theme_dark(base_size = 18)
# Scatter of rating vs. title sentiment, rendered interactively.
p <- books_corti %>%
  ggplot(aes(avg_book_rating, title_sentiment_avg)) +
  geom_point(color = "skyblue", alpha = 1/2, size = 0.5) +
  labs(x = "Average Book Rating", y = "Title Sentiment",
       title = "Exploring the Data: Visualization 1") +
  theme_bw(base_size = 18)
ggplotly(p)
# Ridgeline densities of rating by author gender (rare levels lumped
# by fct_lump), legend suppressed since fill duplicates the y axis.
p <- ggplot(books_corti %>%
              mutate(genderMutated = fct_lump(gender, n = 10)),
            aes(x = avg_book_rating, y = genderMutated, fill = genderMutated)) +
  geom_density_ridges(color = "black") +
  labs(x = "Average Book Rating", y = "Gender of Author",
       title = " Exploring the Data: Visualization 2") +
  theme_minimal(base_size = 18)
p + theme(legend.position = "none")
str(books_corti)
## 'data.frame': 5914 obs. of 22 variables:
## $ bookID : chr "4" "12" "13" "14" ...
## $ title : chr "Harry Potter and the Chamber of Secrets (Harry Potter #2)" "The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy #1-5)" "The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1-5)" "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1)" ...
## $ author : chr "J.K. Rowling" "Douglas Adams" "Douglas Adams" "Douglas Adams" ...
## $ avg_book_rating : num 4.42 4.38 4.38 4.22 4.38 4.21 3.44 3.87 4.07 3.9 ...
## $ isbn : chr "0439554896" "0517226952" "0345453743" "1400052920" ...
## $ language_code : chr "eng" "eng" "eng" "eng" ...
## $ num_pages : num 352 815 815 215 815 544 55 256 335 304 ...
## $ book_ratings_count : int 6333 3628 249558 4930 2877 248558 7270 2088 72451 49240 ...
## $ text_reviews_count : num 244 254 4080 460 195 ...
## $ publication_date : Date, format: "2003-11-01" "2005-11-01" ...
## $ publisher : chr "Scholastic" "Gramercy Books" "Del Rey Books" "Crown" ...
## $ title_word_count : int 18 15 12 44 9 12 8 12 4 14 ...
## $ title_sentiment_avg : num 0 0.31 0.346 0.362 0.4 ...
## $ authorworkcount : int 242 103 103 103 103 59 59 59 59 59 ...
## $ author_fans : int 209174 19029 19029 19029 19029 14356 14356 14356 14356 14356 ...
## $ gender : Factor w/ 3 levels "female","male",..: 1 2 2 2 2 2 2 2 2 2 ...
## $ born : Date, format: NA NA ...
## $ died : Date, format: NA NA ...
## $ avg_author_rating : num 4.46 4.2 4.2 4.2 4.2 4.03 4.03 4.03 4.03 4.03 ...
## $ author_ratings_count: int 24511114 2624222 2624222 2624222 2624222 1272002 1272002 1272002 1272002 1272002 ...
## $ author_review_count : int 579250 57565 57565 57565 57565 76846 76846 76846 76846 76846 ...
## $ genre : chr "fantasy,fiction,young adult" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" ...
# Render qwraps2 summary tables as markdown.
options(qwraps2_markup = "markdown")
view(books_corti)
# Summary-statistics spec for qwraps2::summary_table(): one named group
# per variable, each giving min / mean / max / sd as one-sided formulas
# evaluated in the context of the data frame.
our_summary1 <-
list("Average Book Rating" =
list("min" = ~ min(avg_book_rating),
"mean" = ~ mean(avg_book_rating),
"max" = ~ max(avg_book_rating),
"st. dev" = ~ sd(avg_book_rating)),
"Number of Pages" =
list("min" = ~ min(num_pages),
"mean" = ~ mean(num_pages),
"max" = ~ max(num_pages),
"st.dev" = ~ sd(num_pages)),
"Book Ratings Count" =
list("min" = ~ min(book_ratings_count),
"mean" = ~ mean(book_ratings_count),
"max" = ~ max(book_ratings_count),
"st. dev" = ~ sd(book_ratings_count)),
"Text Reviews Count" =
list("min" = ~ min(text_reviews_count),
"mean" = ~ mean(text_reviews_count),
"max" = ~ max(text_reviews_count),
"st. dev" = ~ sd(text_reviews_count)),
"Average Title Sentiment Score" =
list("min" = ~ min(title_sentiment_avg),
"mean" = ~ mean(title_sentiment_avg),
"max" = ~ max(title_sentiment_avg),
"st. dev" = ~ sd(title_sentiment_avg)),
"Author's Work Count" =
list("min" = ~ min(authorworkcount),
"mean" = ~ mean(authorworkcount),
"max" = ~ max(authorworkcount),
"st. dev" = ~ sd(authorworkcount)),
"Author's Fan Count" =
list("min" = ~ min(author_fans),
"mean" = ~ mean(author_fans),
"max" = ~ max(author_fans),
"st. dev" = ~ sd(author_fans)),
"Author Ratings Count" =
list("min" = ~ min(author_ratings_count),
"mean" = ~ mean(author_ratings_count),
"max" = ~ max(author_ratings_count),
"st. dev" = ~ sd(author_ratings_count)),
"Author Review Count" =
list("min" = ~ min(author_review_count),
"mean" = ~ mean(author_review_count),
"max" = ~ max(author_review_count),
"st. dev" = ~ sd(author_review_count))
)
# NOTE(review): summary_table() returns a formatted table (character
# content); piping it into round(1) looks suspect — confirm it still
# runs without error under the current qwraps2 version.
sum_stats <- summary_table(books_corti, our_summary1) %>% round(1)
print(sum_stats)
##
##
## | |books_corti (N = 5,914) |
## |:---------------------------------|:-----------------------|
## |**Average Book Rating** | |
## | min |1 |
## | mean |3.9 |
## | max |5 |
## | st. dev |0.3 |
## |**Number of Pages** | |
## | min |10 |
## | mean |350.4 |
## | max |1952 |
## | st.dev |195.2 |
## |**Book Ratings Count** | |
## | min |0 |
## | mean |21480.8 |
## | max |4597666 |
## | st. dev |120423.5 |
## |**Text Reviews Count** | |
## | min |0 |
## | mean |696.7 |
## | max |94265 |
## | st. dev |2844.8 |
## |**Average Title Sentiment Score** | |
## | min |-1.4 |
## | mean |0 |
## | max |1.3 |
## | st. dev |0.3 |
## |**Author's Work Count** | |
## | min |1 |
## | mean |231 |
## | max |5204 |
## | st. dev |488.1 |
## |**Author's Fan Count** | |
## | min |0 |
## | mean |12050 |
## | max |709826 |
## | st. dev |55558.6 |
## |**Author Ratings Count** | |
## | min |27 |
## | mean |658708 |
## | max |24511114 |
## | st. dev |1727055.4 |
## |**Author Review Count** | |
## | min |1 |
## | mean |26511.1 |
## | max |579250 |
## | st. dev |58813.9 |
Need to load ‘rsample’ library here.
# Reproducible 80/20 train/test split via rsample.
set.seed(1818)
train_prop <- 0.8
books_split <- initial_split(books_corti, prop = train_prop)
books_train <- training(books_split)
books_test <- testing(books_split)
nrow(books_train)
## [1] 4732
nrow(books_test)
## [1] 1182
Need ‘dplyr’, ‘glmnet’, and ‘glmnetUtils’ libraries here.
# Disable scientific notation so the very small coefficients print in full.
options(scipen = 999)
# Baseline OLS: predict average book rating from book- and author-level
# numeric features plus author gender (factor: female/male/unknown).
mod1 <- lm(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender, data = books_train)
summary(mod1)
##
## Call:
## lm(formula = avg_book_rating ~ num_pages + book_ratings_count +
## text_reviews_count + title_sentiment_avg + authorworkcount +
## author_fans + author_ratings_count + author_review_count +
## gender, data = books_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8325 -0.1575 0.0139 0.1787 1.1391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.79675179461 0.01058977825 358.530 < 0.0000000000000002
## num_pages 0.00027651722 0.00002059907 13.424 < 0.0000000000000002
## book_ratings_count -0.00000005966 0.00000007126 -0.837 0.402464
## text_reviews_count 0.00000817665 0.00000320658 2.550 0.010805
## title_sentiment_avg 0.04097254653 0.01483542542 2.762 0.005771
## authorworkcount 0.00003276625 0.00000871417 3.760 0.000172
## author_fans 0.00000013706 0.00000013538 1.012 0.311392
## author_ratings_count 0.00000006167 0.00000000699 8.823 < 0.0000000000000002
## author_review_count -0.00000172266 0.00000025130 -6.855 0.00000000000804
## gendermale 0.03361376024 0.00913831936 3.678 0.000237
## genderunknown 0.01812663016 0.01336993658 1.356 0.175236
##
## (Intercept) ***
## num_pages ***
## book_ratings_count
## text_reviews_count *
## title_sentiment_avg **
## authorworkcount ***
## author_fans
## author_ratings_count ***
## author_review_count ***
## gendermale ***
## genderunknown
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.275 on 4721 degrees of freedom
## Multiple R-squared: 0.06875, Adjusted R-squared: 0.06678
## F-statistic: 34.85 on 10 and 4721 DF, p-value: < 0.00000000000000022
# -------------------------------------------------------
# estimating "prettier" regression output
# -------------------------------------------------------
Need 'sjPlot' and 'tidymodels' libraries.
# -------------------------------------------------------
# tab_model() outputs a table of results
# -------------------------------------------------------
tab_model(mod1, digits = 3)
| avg book rating | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 3.797 | 3.776 – 3.818 | <0.001 |
| num_pages | 0.000 | 0.000 – 0.000 | <0.001 |
| book_ratings_count | -0.000 | -0.000 – 0.000 | 0.402 |
| text_reviews_count | 0.000 | 0.000 – 0.000 | 0.011 |
| title_sentiment_avg | 0.041 | 0.012 – 0.070 | 0.006 |
| authorworkcount | 0.000 | 0.000 – 0.000 | <0.001 |
| author_fans | 0.000 | -0.000 – 0.000 | 0.311 |
| author_ratings_count | 0.000 | 0.000 – 0.000 | <0.001 |
| author_review_count | -0.000 | -0.000 – -0.000 | <0.001 |
| gender [male] | 0.034 | 0.016 – 0.052 | <0.001 |
| gender [unknown] | 0.018 | -0.008 – 0.044 | 0.175 |
| Observations | 4732 | ||
| R2 / R2 adjusted | 0.069 / 0.067 | ||
# -------------------------------------------------------
# plot_model() outputs a plot of regression coefficients
# -------------------------------------------------------
plot_model(mod1)+ ylim(-0.1,0.1) + ggtitle(" Average Book Rating Coefficients") + theme_minimal(base_size = 16)
# -------------------------------------------------------
# tidy() outputs a table of coefficients and their p-values, t-stats
# -------------------------------------------------------
tidy(mod1)
## # A tibble: 11 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.80 0.0106 359. 0.
## 2 num_pages 0.000277 0.0000206 13.4 2.39e-40
## 3 book_ratings_count -0.0000000597 0.0000000713 -0.837 4.02e- 1
## 4 text_reviews_count 0.00000818 0.00000321 2.55 1.08e- 2
## 5 title_sentiment_avg 0.0410 0.0148 2.76 5.77e- 3
## 6 authorworkcount 0.0000328 0.00000871 3.76 1.72e- 4
## 7 author_fans 0.000000137 0.000000135 1.01 3.11e- 1
## 8 author_ratings_count 0.0000000617 0.00000000699 8.82 1.55e-18
## 9 author_review_count -0.00000172 0.000000251 -6.86 8.04e-12
## 10 gendermale 0.0336 0.00914 3.68 2.37e- 4
## 11 genderunknown 0.0181 0.0134 1.36 1.75e- 1
Note: We used an alpha sequence from 0 to 1 in steps of 0.1.
# Elastic net over a grid of alpha values (0 = ridge ... 1 = lasso),
# each fit with 10-fold cross-validation over lambda via
# glmnetUtils::cva.glmnet (formula interface).
enet_mod <- cva.glmnet(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
data = books_train,
alpha = seq(0,1, by = 0.1))
print(enet_mod)
## Call:
## cva.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count +
## text_reviews_count + title_sentiment_avg + authorworkcount +
## author_fans + author_ratings_count + author_review_count +
## gender, data = books_train, alpha = seq(0, 1, by = 0.1))
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Alpha values: 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
## Number of crossvalidation folds for lambda: 10
plot(enet_mod)
minlossplot(enet_mod,
cv.type = "min")
# Return the alpha whose cross-validated model achieved the lowest CV
# error in a cva.glmnet fit.
#
# fit: a cva.glmnet object with $alpha (numeric vector) and $modlist
#      (one cv.glmnet fit per alpha, each carrying its $cvm error curve).
# Returns: the single best alpha (ties resolve to the first minimum,
#          matching which.min()).
get_alpha <- function(fit) {
  alpha <- fit$alpha
  # vapply instead of sapply: guarantees one numeric(1) per model, so
  # the result cannot silently degrade to a list on malformed input.
  error <- vapply(fit$modlist,
                  function(mod) min(mod$cvm),
                  numeric(1))
  alpha[which.min(error)]
}
# Collect the winning hyper-parameters from a cva.glmnet fit: the alpha
# with the lowest CV error, its lambda.min / lambda.1se, and that error.
#
# fit: a cva.glmnet object ($alpha numeric vector; $modlist list of
#      cv.glmnet fits with $lambda.min, $lambda.1se and $cvm).
# Returns: a one-row data.frame(alpha, lambdaMin, lambdaSE, error).
get_model_params <- function(fit) {
  alpha <- fit$alpha
  # vapply pins every extraction to numeric(1); sapply could silently
  # return a list if a model were missing a component.
  lambdaMin <- vapply(fit$modlist, `[[`, numeric(1), "lambda.min")
  lambdaSE <- vapply(fit$modlist, `[[`, numeric(1), "lambda.1se")
  error <- vapply(fit$modlist, function(mod) min(mod$cvm), numeric(1))
  best <- which.min(error)
  data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
             lambdaSE = lambdaSE[best], error = error[best])
}
# Extract the best alpha value & model parameters.
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
## [1] 1
get_model_params(enet_mod)
## alpha lambdaMin lambdaSE error
## 1 1 0.0001936006 0.01273763 0.07581943
# Extract the best model object.
# NOTE(review): best_alpha computed above is 1 (pure lasso), yet the
# model is refit below with alpha = 0.1 — confirm whether this mismatch
# is intentional.
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]
enet_best_mod <- cv.glmnet(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
data = books_train,
alpha = 0.1)
summary(enet_best_mod)
## Length Class Mode
## lambda 92 -none- numeric
## cvm 92 -none- numeric
## cvsd 92 -none- numeric
## cvup 92 -none- numeric
## cvlo 92 -none- numeric
## nzero 92 -none- numeric
## call 4 -none- call
## name 1 -none- character
## glmnet.fit 12 elnet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
## terms 2 -none- call
## xlev 9 -none- list
## alpha 1 -none- numeric
## nfolds 1 -none- numeric
## sparse 1 -none- logical
## use.model.frame 1 -none- logical
## na.action 1 -none- character
print(enet_best_mod)
## Call:
## cv.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count +
## text_reviews_count + title_sentiment_avg + authorworkcount +
## author_fans + author_ratings_count + author_review_count +
## gender, data = books_train, alpha = 0.1)
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Number of crossvalidation folds: 10
## Alpha: 0.1
## Deviance-minimizing lambda: 0.0001430848 (+1 SE): 0.3229453
Print the model’s two suggested values of lambda.
print(enet_best_mod$lambda.min)
## [1] 0.0001430848
print(enet_best_mod$lambda.1se)
## [1] 0.3229453
Plot how the MSE varies as we vary lambda.
plot(enet_best_mod)
coefpath(enet_best_mod)
Compare lambda min & lambda 1SE…
# put into coefficient vector
enet_coefs <- data.frame(
`lasso_min` = coef(enet_best_mod, s = enet_best_mod$lambda.min) %>%
as.matrix() %>% data.frame() %>% round(3),
`lasso_1se` = coef(enet_best_mod, s = enet_best_mod$lambda.1se) %>%
as.matrix() %>% data.frame() %>% round(3)
) %>% rename(`lasso_min` = 1, `lasso_1se` = 2)
print(enet_coefs)
## lasso_min lasso_1se
## (Intercept) 3.815 3.903
## num_pages 0.000 0.000
## book_ratings_count 0.000 0.000
## text_reviews_count 0.000 0.000
## title_sentiment_avg 0.041 0.000
## authorworkcount 0.000 0.000
## author_fans 0.000 0.000
## author_ratings_count 0.000 0.000
## author_review_count 0.000 0.000
## genderfemale -0.018 0.000
## gendermale 0.015 0.000
## genderunknown 0.000 0.000
enet_coefs %>% kable() %>% kable_styling()
| lasso_min | lasso_1se | |
|---|---|---|
| (Intercept) | 3.815 | 3.903 |
| num_pages | 0.000 | 0.000 |
| book_ratings_count | 0.000 | 0.000 |
| text_reviews_count | 0.000 | 0.000 |
| title_sentiment_avg | 0.041 | 0.000 |
| authorworkcount | 0.000 | 0.000 |
| author_fans | 0.000 | 0.000 |
| author_ratings_count | 0.000 | 0.000 |
| author_review_count | 0.000 | 0.000 |
| genderfemale | -0.018 | 0.000 |
| gendermale | 0.015 | 0.000 |
| genderunknown | 0.000 | 0.000 |
Need ‘partykit’, ‘PerformanceAnalytics’, ‘rpart’, ‘rpart.plot’, and ‘randomForest’ libraries.
options(scipen = 10)
#set.seed(1818)
# store row names as a numeric column so per-row predictions can be
# joined back onto the data by row number
books_boot_preds <- books_corti %>% rownames_to_column() %>%
mutate(rowname = as.numeric(rowname))
B <- 100 # number of bootstrap samples
num_b <- 500 # sample size of each bootstrap
boot_mods <- list() # store our bagging models
# NOTE(review): sampling uses replace = FALSE, so each "bootstrap" is
# really a 500-row subsample without replacement — a true bootstrap
# needs replace = TRUE (though that would create duplicate rownames and
# fan out the left_join below). Confirm which behavior is intended.
for(i in 1:B){
boot_idx <- sample(1:nrow(books_corti),
size = num_b,
replace = FALSE)
# fit a conditional-inference tree on each sample
boot_tree <- ctree(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count+ gender,
data = books_corti %>%
slice(boot_idx))
# store the fitted model
boot_mods[[i]] <- boot_tree
# in-sample predictions for this model, keyed by original row number
preds_boot <- data.frame(
preds_boot = predict(boot_tree),
rowname = boot_idx
)
# rename prediction to indicate which boot iteration it came from
names(preds_boot)[1] <- paste("preds_boot",i,sep = "")
# merge predictions onto the accumulating dataset (one column per i)
books_boot_preds <- left_join(x = books_boot_preds, y = preds_boot,
by = "rowname")
}
# -------------------------------------------------------
# plot() examines an individual model from bagging
# -------------------------------------------------------
plot(boot_mods[[1]], gp = gpar(fontsize = 8))
books_boot_preds <- books_boot_preds %>%
mutate(preds_bag =
select(., preds_boot1:preds_boot100) %>%
rowMeans(na.rm = TRUE))
# NOTE: At this point in the code, the model has been bootstrapped.
# Random forest on the full data; OOB error acts as the holdout metric.
# Fix: dropped the bogus `type = regression` argument — randomForest()
# has no `type` formal, and the bare symbol `regression` never errored
# only because the unused promise was never evaluated. Regression mode
# is inferred automatically from the numeric response.
rf_fit <- randomForest(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
                       data = books_corti,
                       mtry = 11/3, # ~4 predictors tried per split
                       ntree = 200,
                       importance = TRUE)
print(rf_fit)
##
## Call:
## randomForest(formula = avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender, data = books_corti, type = regression, mtry = 11/3, ntree = 200, importance = TRUE)
## Type of random forest: regression
## Number of trees: 200
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.06013366
## % Var explained: 24.51
plot(rf_fit)
varImpPlot(rf_fit, type = 1)
plot_min_depth_distribution(rf_fit)
plot_predict_interaction(rf_fit, books_corti, "author_ratings_count", "num_pages")
plot_predict_interaction(rf_fit, books_corti, "authorworkcount", "num_pages")
plot_predict_interaction(rf_fit, books_corti, "num_pages", "title_sentiment_avg")
Storing predictions data frames for Linear and ElasticNet models…
# Linear-model predictions on the training and testing splits.
lm_preds_train <- predict(mod1, newdata = books_train)
lm_preds_test <- predict(mod1,
newdata = books_test)
# Elastic-net predictions at the deviance-minimizing lambda
# (glmnetUtils formula method accepts newdata directly).
enet_preds_train <- predict(enet_best_mod,
newdata = books_train, s = "lambda.min")
enet_preds_test <- predict(enet_best_mod,
newdata = books_test, s = "lambda.min")
head(lm_preds_train)
## 1 2 3 4 5 6
## 4.446006 4.138931 4.157041 3.976757 4.142188 3.992752
head(lm_preds_test)
## 10 20 21 38 39 43
## 3.874056 3.856204 3.880741 3.859920 3.838027 3.983870
head(enet_preds_train)
## 1
## 1 4.439726
## 2 4.137918
## 3 4.156343
## 4 3.975785
## 5 4.141186
## 6 3.993360
head(enet_preds_test)
## 1
## 10 3.875319
## 20 3.856850
## 21 3.880675
## 38 3.859698
## 39 3.837804
## 43 3.983638
Storing results data frames for Linear and ElasticNet models…
# Bind predictions onto the train/test frames. The glmnet predictions
# are one-column matrices whose column is named "1", which data.frame()
# mangles to `X1` — hence the renames.
training_predictions <- data.frame(lm_preds_train, enet_preds_train)
results_train <- data.frame(books_train, training_predictions) %>% rename(enet_training = X1)
testing_predictions <- data.frame(
"lm_testing" = lm_preds_test,
"enet_testing" = enet_preds_test)
results_test <- data.frame(books_test, testing_predictions) %>% rename(enet_testing = X1)
ggplot(results_train, aes(x = avg_book_rating, y = lm_preds_train)) +
geom_point(alpha = 1/10, size = 4) +
theme_minimal(base_size = 16)+
geom_abline(color = "turquoise")+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Linear Regression: Training True vs Predicted")
ggplot(results_train, aes(x = avg_book_rating, y = enet_preds_train)) +
geom_point(alpha = 1/10, size = 4) +
theme_minimal(base_size = 16)+
geom_abline(color = "turquoise")+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Best ElasticNet: Training True vs Predicted")
ggplot(results_test, aes(x = avg_book_rating, y = lm_preds_test)) +
geom_point(alpha = 1/10, size = 4) +
geom_abline(color = "coral")+
theme_minimal(base_size = 16)+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Linear Regression: Testing True vs Predicted")
ggplot(results_test, aes(x = avg_book_rating, y = enet_preds_test)) +
geom_point(alpha = 1/10, size = 4) +
geom_abline(color = "coral")+
theme_minimal(base_size = 16)+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Best ElasticNet: Testing True vs Predicted")
# -------------------------------------------------------
# 21) MODEL EVALUATION
# -------------------------------------------------------
rmse(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.275
mae(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 0.209
rsq(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0687
lm_rmse <- rmse(books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_mae <- mae(books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_rsq <- rsq(books_test, truth = avg_book_rating, estimate = lm_preds_test)
rmse(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.275
mae(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 0.209
rsq(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0687
enet_rmse <- rmse(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_mae <- mae(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_rsq <- rsq(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
Tree OUT-OF-BAG Predictions…
books_right_join <- right_join(books_corti, books_boot_preds)
## Joining, by = c("bookID", "title", "author", "avg_book_rating", "isbn", "language_code", "num_pages", "book_ratings_count", "text_reviews_count", "publication_date", "publisher", "title_word_count", "title_sentiment_avg", "authorworkcount", "author_fans", "gender", "born", "died", "avg_author_rating", "author_ratings_count", "author_review_count", "genre")
books_right_join <- books_right_join %>% ungroup()
tree_rmse <- rmse(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_mae <- mae(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_rsq <- rsq(books_right_join, truth = avg_book_rating, estimate = preds_bag)
Random Forest OUT-OF-BAG Predictions…
preds_OOB <- predict(rf_fit)
rf_rsq <- rsq(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_rmse <- rmse(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_mae <- mae(books_corti, truth = avg_book_rating, estimate = preds_OOB)
All testing data predictions…
rsq_DF <- merge(rf_rsq, enet_rsq, by=c(".metric", ".estimator"))
rsq_DF1 <- merge(rsq_DF, lm_rsq, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate)
rsq_DF2 <- merge(rsq_DF1, tree_rsq, by=c(".metric", ".estimator")) %>% select(-.estimator)
print(rsq_DF2)
## .metric Random Forest ElasticNet Linear .estimate
## 1 rsq 0.24535 0.06579101 0.06588606 0.1084553
rmse_DF <- merge(rf_rmse, enet_rmse, by=c(".metric", ".estimator"))
rmse_DF1 <- merge(rmse_DF, lm_rmse, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate)
rmse_DF2 <- merge(rmse_DF1, tree_rmse, by=c(".metric", ".estimator")) %>% select(-.estimator)
print(rmse_DF2)
## .metric Random Forest ElasticNet Linear .estimate
## 1 rmse 0.2452216 0.2633069 0.2633021 0.2692242
mae_DF <- merge(rf_mae, enet_mae, by=c(".metric", ".estimator"))
mae_DF1 <- merge(mae_DF, lm_mae, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate)
mae_DF2 <- merge(mae_DF1, tree_mae, by=c(".metric", ".estimator")) %>% select(-.estimator)
print(mae_DF2)
## .metric Random Forest ElasticNet Linear .estimate
## 1 mae 0.1795154 0.2016244 0.2016341 0.2051364
total <- rbind(rsq_DF2, rmse_DF2)
final <-rbind(total, mae_DF2) %>% rename("Tree" = .estimate, "Metrics" = .metric)
print(final)
## Metrics Random Forest ElasticNet Linear Tree
## 1 rsq 0.2453500 0.06579101 0.06588606 0.1084553
## 2 rmse 0.2452216 0.26330694 0.26330206 0.2692242
## 3 mae 0.1795154 0.20162440 0.20163410 0.2051364
Credit for the code below: https://rfortherestofus.com/2019/11/how-to-make-beautiful-tables-in-r/
Need to load ‘kableExtra’ library.
final %>% kable() %>% kable_styling()
| Metrics | Random Forest | ElasticNet | Linear | Tree |
|---|---|---|---|---|
| rsq | 0.2453500 | 0.0657910 | 0.0658861 | 0.1084553 |
| rmse | 0.2452216 | 0.2633069 | 0.2633021 | 0.2692242 |
| mae | 0.1795154 | 0.2016244 | 0.2016341 | 0.2051364 |
Credit for code below: https://www.littlemissdata.com/blog/prettytables
Need to load ‘formattable’, ‘tidyr’, and ‘data.table’ libraries.
custom_one = "#CCCCFF"
custom_two = "skyblue"
custom_three = "#4ec5a5"
custom_coral = "#FA7268"
# custom_green = "#00AD43"
formattable(final,
align =c("l","c","c","c","c", "c", "c", "c", "r"),
list(`Metrics` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold")),
`Random Forest`= color_tile(custom_one, custom_one),
`ElasticNet`= color_tile(custom_two, custom_two),
`Linear`= color_tile(custom_three, custom_three),
`Tree`= color_tile(custom_coral, custom_coral)
))
| Metrics | Random Forest | ElasticNet | Linear | Tree |
|---|---|---|---|---|
| rsq | 0.2453500 | 0.06579101 | 0.06588606 | 0.1084553 |
| rmse | 0.2452216 | 0.26330694 | 0.26330206 | 0.2692242 |
| mae | 0.1795154 | 0.20162440 | 0.20163410 | 0.2051364 |